Experiment notes

  • Cell suspensions of meningeal dura enriched by MACS for cells expressing Cd45, Cd31, and Lyve-1
    • Cd45/Ptprc/B220/Ly-5/Lyt-4/T200: transmembrane protein tyrosine phosphatase, located on most haematopoietic cells, positive selection of leukocytes
    • Cd31/Pecam1: adhesion and signaling receptor that is expressed on endothelial and hematopoietic cells, positive selection of endothelial cells
    • Lyve1/1200012G08Rik/Lyve-1/Xlkd1/lymphatic vessel endothelial HA receptor-1: found primarily on lymphatic endothelial cells
  • The meningeal layer of the dura mater is a durable, dense fibrous membrane that passes through the foramen magnum and is continuous with the dura mater of the spinal cord. The meningeal layer of the dura mater creates several dural folds that divide the cranial cavity into freely communicating spaces.
  • Aimed for ~5000 cells per sample to be sequenced at least 50,000 reads per cell
    • Might ended up with more because we sequenced with the NovaSeq S2
  • Expecting to see a lot of leukocytes and vascular cells
  • The purpose is to check the effect of sex and Apoe isoform expression on the transcriptomes of the different cell subpopulations
  • 4 experimental groups/samples
    • E3_14M_F = APOE3 expressing; 14 months old; female
    • E3_14M_M = APOE3 expressing; 14 months old; male
    • E4_14M_F = APOE4 expressing; 14 months old; female
    • E4_14M_M = APOE4 expressing; 14 months old; male

Setup

Set working directory

knitr::opts_knit$set(root.dir = ".")

Load libraries

library(cowplot)     # plot_grid()
library(dplyr)       # left_join()
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)     # ggplot()
library(gridExtra)   # grid.arrange()
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(parallel)    # detectCores()
library(rtracklayer) # import()
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
library(scCustomize) # Merge_Seurat_List()
## Loading required package: Seurat
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'sp'
## The following object is masked from 'package:IRanges':
## 
##     %over%
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:GenomicRanges':
## 
##     intersect
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following object is masked from 'package:IRanges':
## 
##     intersect
## The following object is masked from 'package:S4Vectors':
## 
##     intersect
## The following object is masked from 'package:BiocGenerics':
## 
##     intersect
## The following objects are masked from 'package:base':
## 
##     intersect, t
## scCustomize v2.1.2
## If you find the scCustomize useful please cite.
## See 'samuel-marsh.github.io/scCustomize/articles/FAQ.html' for citation info.
library(Seurat)      # Read10X_h5()
library(stringr)     # str_match()

Variables and functions

# variables
out <- "../../results/pass_1/all_clusters/"
sample_order <- c("E3.M","E3.F","E4.M","E4.F")
sample_colors <- c("#26946A","#1814A1","#EAC941","#DF5F00")
sample_order2 <- c("Male E3", "Male E4", "Female E3", "Female E4")
isoform_order <- c("E4","E3")
isoform_colors <- c("darkgray","cornflowerblue")
sex_order <- c("Male","Female")
sex_colors <- c("darkgray","purple")

# thresholds
nCount.min <- 500
nCount.max <- 20000
nFeature.min <- 250
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 50
hb.cutoff <- 3

# functions
source("../../../functions/Kennedi_single_cell_functions.R")
saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}

Load data

Merge h5

prefix <- "../../counts/"
suffix <- "/outs/filtered_feature_bc_matrix.h5"

if (file.exists("../../rObjects/raw_h5.rds")) {
  mouse <- readRDS("../../rObjects/raw_h5.rds")
} else {
  # individual sample objects
  E3.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_F",suffix)))
  E3.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_M",suffix)))
  E4.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_F",suffix)))
  E4.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_M",suffix)))

  # merge objects
  mouse <- merge(x = E3.F, 
                 y = c(E3.M,E4.F,E4.M),
                 add.cell.ids = c("E3.F","E3.M","E4.F","E4.M"), 
                 project = "Mouse Meningeal Dura scRNAseq")
  
  # cleanup and save
  remove(E3.F, E3.M, E4.F, E4.M)
  saveRDS(mouse, "../../rObjects/raw_h5.rds")
}

# preview
mouse
## An object of class Seurat 
## 32285 features across 33388 samples within 1 assay 
## Active assay: RNA (32285 features, 0 variable features)
##  4 layers present: counts.1, counts.2, counts.3, counts.4

Annotation

  • Annotation file was downloaded from 10x Genomics
    • refdata-gex-mm10-2020-A
    • provider: GENCODE
    • description: evidence-based annotation of the mouse genome (GRCm38), version M23 (Ensembl 98)
  • Gm* genes are originally annotated by MGI and the *Rik genes are annotated by RIKEN
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
  genes <- readRDS("../../rObjects/annotation.rds")
} else {
  gtf.file <- "../../refs/mouse_genes.gtf"
  genes <- rtracklayer::import(gtf.file)
  genes <- as.data.frame(genes)
  genes <- genes[genes$type == "gene",]
  saveRDS(genes, "../../rObjects/annotation.rds")
}

Metadata columns

# create sample column
barcodes <- colnames(mouse)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[,2]
table(sample)
## sample
##  E3.F  E3.M  E4.F  E4.M 
##  6747  8029 11204  7408
mouse$sample <- factor(sample, levels = sample_order)
table(mouse$sample)  # check
## 
##  E3.M  E3.F  E4.M  E4.F 
##  8029  6747  7408 11204
Idents(mouse) <- mouse$sample

# age column
mouse$age <- "14 months"

# sex column
sex <- str_match(mouse$sample, "E\\d.([FM])")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)

# Apoe isoform column
isoform <- str_match(mouse$sample, "(E\\d).[FM]")[,2]
mouse$isoform <- factor(isoform, levels = isoform_order)

# sample2 column (for title formatting)
mouse$sample2 <- paste(mouse$sex, mouse$isoform, sep = " ")

# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)

# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
##   [1] "Mrpl15"     "Mrpl30"     "Mrps9"      "Mrpl44"     "Mrps14"    
##   [6] "Mrpl41"     "Mrps2"      "Mrps5"      "Mrps26"     "Mrps28"    
##  [11] "Mrpl47"     "Mrpl24"     "Mrpl9"      "Mrps21"     "Mrpl50"    
##  [16] "Mrpl37"     "Mrps15"     "Mrpl20"     "Mrpl33"     "Mrpl1"     
##  [21] "Mrps18c"    "Mrps17"     "Mrps33"     "Mrpl35"     "Mrpl19"    
##  [26] "Mrpl53"     "Mrps25"     "Mrpl51"     "Mrps35"     "Mrps12"    
##  [31] "Mrpl46"     "Mrps11"     "Mrpl48"     "Mrpl17"     "Mrpl23"    
##  [36] "Mrps31"     "Mrpl34"     "Mrpl4"      "Mrps22"     "Mrpl3"     
##  [41] "Mrpl54"     "Mrpl42"     "Mrps24"     "Mrpl22"     "Mrpl55"    
##  [46] "Mrps23"     "Mrpl27"     "Mrpl10"     "Mrpl45"     "Mrpl58"    
##  [51] "Mrps7"      "Mrpl38"     "Mrpl12"     "Mrpl32"     "Mrpl36"    
##  [56] "Mrps27"     "Mrps36"     "Mrps30"     "Mrps16"     "Mrpl52"    
##  [61] "Mrpl57"     "Mrpl13"     "Mrpl40"     "Mrpl39"     "Mrps6"     
##  [66] "Mrpl18"     "Mrps34"     "Mrpl28"     "Mrps18b"    "Mrpl14"    
##  [71] "Mrps18a"    "Mrpl2"      "Mrps10"     "Mrpl21"     "Mrpl11"    
##  [76] "Mrpl49"     "Mrpl16"     "Mrpl43"     "Rpl7"       "Rpl31"     
##  [81] "Rpl37a"     "Rps6kc1"    "Rpl7a"      "Rpl12"      "Rpl35"     
##  [86] "Rps21"      "Rpl22l1"    "Rps3a1"     "Rps27"      "Rpl34"     
##  [91] "Rps20"      "Rps6"       "Rps8"       "Rps6ka1"    "Rpl11"     
##  [96] "Rpl22"      "Rpl9"       "Rpl5"       "Rplp0"      "Rpl6"      
## [101] "Rpl21"      "Rpl32"      "Rps9"       "Rpl28"      "Rps5"      
## [106] "Rps19"      "Rps16"      "Rps11"      "Rpl13a"     "Rpl18"     
## [111] "Rps17"      "Rps3"       "Rpl27a"     "Rps13"      "Rps15a"    
## [116] "Rplp2"      "Rpl18a"     "Rpl13"      "Rps25"      "Rpl10-ps3" 
## [121] "Rplp1"      "Rpl4"       "Rps27l"     "Rpl29"      "Rps27rt"   
## [126] "Rpsa"       "Rpl14"      "Rps12"      "Rps15"      "Rpl41"     
## [131] "Rps26"      "Rps27a"     "Rpl26"      "Rpl23a"     "Rpl9-ps1"  
## [136] "Rps6kb1"    "Rpl23"      "Rpl19"      "Rpl27"      "Rpl38"     
## [141] "Rps7"       "Rpl10l"     "Rps29"      "Rpl36al"    "Rps6kl1"   
## [146] "Rps6ka5"    "Rps23"      "Rpl15"      "Rps24"      "Rpl36a-ps1"
## [151] "Rpl37"      "Rpl30"      "Rpl8"       "Rpl3"       "Rps19bp1"  
## [156] "Rpl39l"     "Rpl35a"     "Rpl24"      "Rps6ka2"    "Rps2"      
## [161] "Rpl3l"      "Rps10"      "Rpl10a"     "Rps28"      "Rps18"     
## [166] "Rpl7l1"     "Rpl36"      "Rpl36-ps4"  "Rps14"      "Rpl17"     
## [171] "Rps6kb2"    "Rps6ka4"    "Rpl9-ps6"   "Rpl39"      "Rpl10"     
## [176] "Rps4x"      "Rps6ka6"    "Rpl36a"     "Rps6ka3"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")

ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
  geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1756 rows containing non-finite outside the scale range
## (`stat_density()`).

Violin plots

# normalize data for violin plots
mouse <- NormalizeData(mouse, assay = "RNA")
## Normalizing layer: counts.1
## Normalizing layer: counts.2
## Normalizing layer: counts.3
## Normalizing layer: counts.4
# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v1

#  percent violins
v2 <- VlnPlot(mouse,
              features = c("percent.mt","percent.ribo.protein","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v2

Scatter plots

s1 <- ggplot(
  mouse@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") + 
  geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

s2 <- FeatureScatter(mouse,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
s2

Filtering

Cell-level filtering

# filter
mouse.filtered <- subset(mouse,
                         subset = (nCount_RNA > nCount.min) &
                           (nCount_RNA < nCount.max) &
                           (nFeature_RNA > nFeature.min) &
                           (cell.complexity > complexity.cutoff) &
                           (percent.mt < mt.cutoff) &
                           (percent.hb < hb.cutoff))

# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "12151 cells removed"

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
mouse.filtered <- JoinLayers(mouse.filtered)
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
nonzero <- counts > 0  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered, 
                                     meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "10587 features removed"

Remove highly abundant mitochondrial transcripts.

# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
                                     meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "13 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero)

Post-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")

ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))

# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
  geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1327 rows containing non-finite outside the scale range
## (`stat_density()`).

Violin plots

# normalize data before violin plots
mouse.filtered <- NormalizeData(mouse.filtered)
## Normalizing layer: counts
# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v3

#  percent violins
v4 <- VlnPlot(mouse.filtered,
              features = c("percent.mt","percent.ribo.protein","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v4

Scatter plots

s3 <- ggplot(
  mouse.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") + 
  geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

s4 <- FeatureScatter(mouse.filtered,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
s4

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Sample")
b1

Top transcripts

df <- data.frame(gene_name = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df <- df[order(df$rsum, decreasing = TRUE),]
rownames(df) <- 1:nrow(df)
head(df, 10)
##    gene_name     rsum
## 1    Gm42418 11194504
## 2     Malat1  3490427
## 3      Cmss1   741422
## 4     Tmsb4x   732027
## 5     S100a9   630464
## 6     Camk1d   590094
## 7     S100a8   456591
## 8       Cd74   433221
## 9       Igkc   426296
## 10  AY036118   396173

Unwanted variation

Cell cycle

markers <- "../../../functions/cell_cycle_markers.tsv"
path <- paste0(out, "unwanted_variation")
mouse.filtered[["phase"]] <- cell_cycle_QC(obj = mouse.filtered,
                                           markersPath = markers,
                                           species = "mouse",
                                           outDir = path)

Mitochondria

mouse.filtered[["mito.factor"]] <- mitochondria_QC(obj = mouse.filtered,
                                                   outDir = path)

Cluster

SCTransform

  • SCTransform method is a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes.
  • Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model.
  • If there are other sources of uninteresting variation it can be included.
# split
mouse.filtered[["RNA"]] <- split(mouse.filtered[["RNA"]], 
                                 f = mouse.filtered$sample)

# transform
mouse.filtered <- SCTransform(mouse.filtered, verbose = FALSE)
  • NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the data set has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument.
  • It is suggested to not regress out batch, and instead use a data integration method like Harmony

PCA

# run PCA on the merged object
mouse.filtered <- RunPCA(object = mouse.filtered, assay = "SCT")
## Warning in PrepDR(object = object, features = features, verbose = verbose): The
## following 13 features requested have not been scaled (running reduction without
## them): Ighg1, Rag1, Igll1, Prss34, Gm30948, Igkv6-15, Gm30613, Slc26a4,
## Atp6v1c2, Fut9, Ighv1-80, Igkv8-19, Tnn
## PC_ 1 
## Positive:  Mrc1, Cd74, Lyz2, F13a1, Slc9a9, Arhgap15, Bank1, S100a9, H2-Eb1, C1qa 
##     S100a8, Slc8a1, Apoe, H2-Aa, H2-Ab1, Skap1, Mctp1, C1qb, Pid1, Ptprc 
##     Inpp4b, C1qc, Rbpj, Pf4, Kcnq5, Rnf150, Zeb2, Dock2, Csf1r, Ctsc 
## Negative:  Gpc6, Flt1, Ptprg, Ldb2, Foxp2, Bnc2, Cdh11, Slc4a10, Ptprb, Igfbp5 
##     Pcdh7, Auts2, Ptprm, Plcb1, Mir100hg, Prkg1, Plpp3, Cacna1c, Cyyr1, Col12a1 
##     Tenm3, Igfbp7, Emcn, Ptprd, Nfib, Mecom, Stox2, 9530026P05Rik, Pdgfd, Adamts9 
## PC_ 2 
## Positive:  Gpc6, Cdh11, Foxp2, Slc4a10, Bnc2, Igfbp5, Col12a1, Tenm3, Mir100hg, Cacna1c 
##     Col1a2, Ptprd, Slit2, Zfhx4, Slc38a2, Epha3, Prrx1, Lama2, Pdzrn3, Col1a1 
##     Egfr, Slc47a1, Ank2, Tspan11, Boc, Map1b, Naaladl2, Eya2, Snhg11, Adam12 
## Negative:  Flt1, Ldb2, Ptprg, Ptprb, Plcb1, Cyyr1, Emcn, Igfbp3, Adgrl4, Kdr 
##     Ptprm, Stox2, Plpp1, Podxl, Plvap, Mecom, Prex2, Arl15, Col4a1, Adgrf5 
##     Dysf, Igfbp7, Pecam1, Sema6a, Col4a2, Samd12, Rapgef4, Hmcn1, Insr, Etl4 
## PC_ 3 
## Positive:  Mrc1, F13a1, Slc9a9, Apoe, C1qa, Pid1, Dab2, Slc8a1, Rbpj, Rnf150 
##     Cd74, Stab1, C1qb, Pf4, C1qc, Ctsc, Zdhhc14, Csf1r, Ccl8, Bank1 
##     H2-Eb1, Cd163, Adgre1, Maf, H2-Aa, Zeb2, Itsn1, Fcrls, Ms4a7, H2-Ab1 
## Negative:  S100a9, S100a8, Retnlg, Lcn2, Mmp8, Wfdc21, Ngp, Pglyrp1, Ltf, S100a6 
##     Camp, Hp, Mmp9, Ifitm6, Anxa1, Cxcr2, Cd177, Gda, Hdc, Pygl 
##     Gsr, S100a11, Abca13, Mxd1, Chil3, Slfn4, Csf3r, Dach1, Ly6g, Slpi 
## PC_ 4 
## Positive:  Kcnq5, Skap1, Inpp4b, Aff3, Pax5, Bach2, Ikzf3, Tmem163, Cd79a, St6galnac3 
##     Il7r, Ms4a1, Ebf1, Ly6d, Tox, Agbl1, Cacna1e, Itk, Il1rl1, Prkcq 
##     Arhgap24, Bcl11b, Dach2, Bcl11a, Cd79b, Gata3, Gm2682, Bcl2, Chst3, Vpreb3 
## Negative:  Mrc1, F13a1, Lyz2, Apoe, C1qa, Pid1, Dab2, Slc8a1, Slc9a9, Pf4 
##     C1qb, Mctp1, C1qc, Stab1, Rbpj, Rnf150, Ccl8, Csf1r, Aoah, Cd163 
##     Adgre1, Trf, Ctsc, Fcrls, Ms4a7, Selenop, S100a9, Itsn1, Ctsb, S100a8 
## PC_ 5 
## Positive:  Skap1, Inpp4b, St6galnac3, Il1rl1, Kcnq5, Tox, Dach2, Itk, Il7r, Prkcq 
##     Bcl11b, Gata3, Gm2682, Tnik, Themis, Large1, Cd226, Camk4, Ikzf2, 1700113H08Rik 
##     Plxdc2, Icos, Ptpn13, Ms4a4b, Chn2, Pard3, Rnf128, Cd247, Plcl1, Pde7b 
## Negative:  Pax5, Aff3, Cd79a, Tmem163, Bank1, Ms4a1, Ebf1, Bach2, Ly6d, Agbl1 
##     Arhgap24, Cd79b, Bcl11a, Cacna1e, Vpreb3, Chst3, Klhl14, Cecr2, Ikzf3, Gm30211 
##     Btla, Iglc3, Ralgps2, Iglc2, Fcmr, Siglecg, Gm15987, Pou2af1, Cd74, Ighm
# Reset idents and levels
DefaultAssay(mouse.filtered) <- "SCT"
Idents(mouse.filtered) <- "sample"
mouse.filtered$sample <- factor(mouse.filtered$sample, 
                              levels = sample_order)
mouse.filtered$sex <- factor(mouse.filtered$sex, 
                           levels = sex_order)
mouse.filtered$isoform <- factor(mouse.filtered$isoform,
                               levels = isoform_order)

# Plot PCA
pca1 <- DimPlot(mouse.filtered,
                reduction = "pca",
                split.by = "sample",
                group.by = "sample",
                cols = sample_colors,
                ncol = 2)
pca1

pca2 <- DimPlot(mouse.filtered,
                reduction = "pca",
                split.by = "isoform",
                group.by = "isoform",
                cols = isoform_colors)
pca2

pca3 <- DimPlot(mouse.filtered,
                reduction = "pca",
                split.by = "sex",
                group.by = "sex",
                cols = sex_colors)
pca3

pca4 <- DimPlot(mouse.filtered,
                reduction = "pca",
                group.by = "sample",
                cols = sample_colors,
                shuffle = TRUE,
                raster = FALSE)
pca4

e1 <- ElbowPlot(mouse.filtered) +
  geom_vline(xintercept = 15, linetype = "dashed", colour = "red")
e1

Harmony

# integrate
mouse.filtered <- IntegrateLayers(mouse.filtered,
                                  method = "HarmonyIntegration",
                                  assay = "SCT",
                                  orig.reduction = "pca")
## Transposing data matrix
## Using automatic lambda estimation
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony converged after 4 iterations
# re-join layers
mouse.filtered[["RNA"]] <- JoinLayers(mouse.filtered[["RNA"]])
hrm1 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                split.by = "sample",
                group.by = "sample",
                cols = sample_colors,
                ncol = 2)
hrm1

hrm2 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                split.by = "isoform",
                group.by = "isoform",
                cols = isoform_colors)
hrm2

hrm3 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                split.by = "sex",
                group.by = "sex",
                cols = sex_colors)
hrm3

hrm4 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                group.by = "sample",
                cols = sample_colors,
                shuffle = TRUE,
                raster = FALSE)
hrm4

# Plot elbow
mouse.filtered@reductions$harmony@stdev <-
  apply(mouse.filtered@reductions$harmony@cell.embeddings, 2, sd)
e1 <- ElbowPlot(mouse.filtered,
                reduction = "harmony") +
  geom_vline(xintercept = 15, linetype = "dashed", color = "red")
e1

UMAP

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# run UMAP
mouse.filtered <- RunUMAP(mouse.filtered,
                          dims = 1:15,
                          reduction = "harmony",
                          assay = "SCT",
                          n.components = 3) # set to 3 to use with VR
## 15:50:35 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:50:35 Read 21237 rows and found 15 numeric columns
## 15:50:35 Using Annoy for neighbor search, n_neighbors = 30
## 15:50:35 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:50:38 Writing NN index file to temp file /tmp/RtmpborP80/file278b265477b7d4
## 15:50:38 Searching Annoy index using 1 thread, search_k = 3000
## 15:50:46 Annoy recall = 100%
## 15:50:47 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 15:50:49 Initializing from normalized Laplacian + noise (using RSpectra)
## 15:50:50 Commencing optimization for 200 epochs, with 877344 positive edges
## 15:50:59 Optimization finished
# plot UMAP
DimPlot(mouse.filtered,
        cols = sample_colors,
        shuffle = TRUE)

Clusters

  • Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph. By default, it uses k = 30.
  • Then, it creates a shared nearest neighbor (SNN) graph and identifies communities
  • Increased resolution leads to a greater number of clusters
# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.filtered,
                                   assay = "SCT", # set as default after SCTransform
                                   reduction = "harmony",
                                   dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
                                  algorithm = 1, # 1 = Louvain
                                  resolution = seq(0.1, 0.5, by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9777
## Number of communities: 12
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9677
## Number of communities: 15
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9601
## Number of communities: 18
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9534
## Number of communities: 21
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9475
## Number of communities: 26
## Elapsed time: 2 seconds
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.5

Explore resolutions

# 0.1
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE)

# 0.2
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE)

# 0.3
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE)

# 0.4
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE)

# 0.5
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.5",
        label = TRUE)